Data Visualization#
Once upon a time there were plots upon plots upon plots.
Load data#
Show code cell source
import pandas as pd
import sys
sys.path.append('../')
from source.bokeh_plots import *
from source.data_visualization import *
# Route Bokeh output to the notebook.
output_notebook()

file_path = '../data/al_atlas_main_results.xlsx'
model_name = 'AML Epigenomic Risk'

# Read the results table; the first spreadsheet column becomes the index,
# which is then sorted.
df = pd.read_excel(file_path, index_col=0).sort_index()

# Define train and test samples. Explicit copies are taken because later
# cells assign columns on these frames; assigning into a filtered slice of
# `df` triggers pandas' chained-assignment warning.
df_train = df[df['Train-Test'] == 'Train Sample'].copy()
df_test = df[df['Train-Test'] == 'Test Sample'].copy()

# Prognosis-model cohort: training samples with a recorded 'Vital Status'.
df_px = df_train[df_train['Vital Status'].notna()]

# Diagnosis-model cohort: training samples with a recorded
# 'WHO 2022 Diagnosis' label.
df_dx = df_train[df_train['WHO 2022 Diagnosis'].notna()]

# Exclude the classes with fewer than 10 samples.
excluded_diagnoses = [
    'MPAL with t(v;11q23.3)/KMT2A-r',
    'B-ALL with hypodiploidy',
    'AML with t(16;21); FUS::ERG',
    'AML with t(9;22); BCR::ABL1',
]
df_dx = df_dx[~df_dx['WHO 2022 Diagnosis'].isin(excluded_diagnoses)]

# Select diagnostic samples from COG AAML1031, AAML0531 and AAML03P1,
# keeping one row per patient (the last occurrence in index order).
df_cog = df[df['Clinical Trial'].isin(['AAML0531', 'AAML1031', 'AAML03P1'])]
df_cog = df_cog[df_cog['Sample Type'].isin([
    'Diagnosis',
    'Primary Blood Derived Cancer - Bone Marrow',
    'Primary Blood Derived Cancer - Peripheral Blood',
])]
df_cog = df_cog[~df_cog['Patient_ID'].duplicated(keep='last')]
Interactive atlas#
Show code cell source
# Draw the interactive atlas from the full results table (train and test samples together).
plot_linked_scatters(df)
Patient Characteristics#
Foundation (unsupervised) model#
Show code cell source
from tableone import TableOne
from datetime import date
# Variables summarised for the foundation (unsupervised) model cohort.
columns = ['Hematopoietic Entity', 'Age (group years)', 'Sex', 'Clinical Trial']

# One ungrouped summary of every training sample, with a "Missing" column.
# `order` sets the display order of category levels. 'FLT3 ITD' used to be
# listed twice in this dict; only the later value (['Yes']) ever took effect,
# so that single entry is kept. Several keys name variables that are not in
# `columns` for this table; they are left as they were.
mytable_cog = TableOne(
    df_train.reset_index(), columns,
    overall=False, missing=True,
    pval=False, pval_adjust=False,
    htest_name=True, dip_test=True,
    tukey_test=True, normal_test=True,
    order={
        'FLT3 ITD': ['Yes'],
        'Age (group years)': ['0-5', '5-13', '13-39', '39-60'],
        'MRD 1 Status': ['Positive'],
        'Risk Group': ['High Risk', 'Standard Risk'],
        'Leucocyte counts (10⁹/L)': ['≥30'],
        'Age group (years)': ['≥10'],
    },
)

# Save a dated copy of the table next to the input data.
mytable_cog.to_excel(f'../data/pt_characteristics_foundation_model_{date.today()}.xlsx')

# Last expression of the cell: the HTML rendering is what the notebook displays.
mytable_cog.tabulate(tablefmt="html",
                     # headers=[score_name,"",'Missing','Discovery','Validation','p-value','Statistical Test']
                     )
Show code cell output
| Missing | Overall | ||
|---|---|---|---|
| n | 3308 | ||
| Hematopoietic Entity, n (%) | Acute lymphoblastic leukemia (ALL) | 844 | 700 (28.4) |
| Acute myeloid leukemia (AML) | 1207 (49.0) | ||
| Acute promyelocytic leukemia (APL) | 31 (1.3) | ||
| Mixed phenotype acute leukemia (MPAL) | 50 (2.0) | ||
| Myelodysplastic syndrome (MDS or MDS-like) | 225 (9.1) | ||
| Otherwise-Normal (Control) | 251 (10.2) | ||
| Age (group years), n (%) | 0-5 | 1320 | 480 (24.1) |
| 5-13 | 482 (24.2) | ||
| 13-39 | 658 (33.1) | ||
| 39-60 | 165 (8.3) | ||
| 60+ | 203 (10.2) | ||
| Sex, n (%) | Female | 1511 | 883 (49.1) |
| Male | 914 (50.9) | ||
| Clinical Trial, n (%) | AAML03P1 | 41 | 72 (2.2) |
| AAML0531 | 628 (19.2) | ||
| AAML1031 | 581 (17.8) | ||
| Beat AML Consortium | 316 (9.7) | ||
| CCG2961 | 41 (1.3) | ||
| CETLAM SMD-09 (MDS-tAML) | 166 (5.1) | ||
| French GRAALL 2003–2005 | 141 (4.3) | ||
| Japanese AML05 | 64 (2.0) | ||
| NOPHO ALL92-2000 | 933 (28.6) | ||
| TARGET ALL | 131 (4.0) | ||
| TCGA AML | 194 (5.9) |
Fine-tuned (supervised) models#
Show code cell source
# Variables summarised for the fine-tuned (supervised) model cohorts.
columns = ['Age (years)', 'Age group (years)', 'Sex', 'Race or ethnic group',
           'Hispanic or Latino ethnic group', 'MRD 1 Status',
           'Leucocyte counts (10⁹/L)', 'BM leukemic blasts (%)',
           'Risk Group', 'FLT3 ITD', 'Clinical Trial']

# Cast validation ages to float so they concatenate cleanly with the
# discovery cohorts. `assign` rebinds `df_test` to a new frame instead of
# writing into a filtered slice of `df`, which avoids pandas'
# chained-assignment warning.
df_test = df_test.assign(**{'Age (years)': df_test['Age (years)'].astype(float)})

# Join discovery clinical data with validation clinical data; the `keys`
# become a 'cohort' column after reset_index() and drive the groupby below.
all_cohorts = pd.concat(
    [df_dx, df_px, df_test],
    axis=0,
    keys=['AL Epigenomic Phenotype', 'AML Epigenomic Risk', 'Validation'],
    names=['cohort'],
).reset_index()

# columns = ['Age group (years)','Sex', 'MRD 1 Status',
# 'Leucocyte counts (10⁹/L)',
# 'Risk Group','FLT3 ITD', 'Treatment Arm','Clinical Trial']

# `order` sets the display order of category levels. 'FLT3 ITD' used to be
# listed twice in this dict; only the later value (['Yes']) ever took effect,
# so that single entry is kept.
mytable_cog = TableOne(
    all_cohorts, columns,
    overall=False, missing=False,
    pval=False, pval_adjust=False,
    htest_name=True, dip_test=True,
    tukey_test=True, normal_test=True,
    order={
        'FLT3 ITD': ['Yes'],
        'Race or ethnic group': ['White', 'Black or African American', 'Asian'],
        'MRD 1 Status': ['Positive'],
        'Risk Group': ['High Risk', 'Standard Risk'],
        'Leucocyte counts (10⁹/L)': ['≥30'],
        'Age group (years)': ['≥10'],
    },
    groupby='cohort',
)

# Save a dated copy of the table next to the input data.
mytable_cog.to_excel(f'../data/pt_characteristics_fine-tuned_models_{date.today()}.xlsx')

# Last expression of the cell: the HTML rendering is what the notebook displays.
mytable_cog.tabulate(tablefmt="html",
                     # headers=[score_name,"",score_name,'Validation','p-value','Statistical Test']
                     )
Show code cell output
| AL Epigenomic Phenotype | AML Epigenomic Risk | Validation | ||
|---|---|---|---|---|
| n | 2445 | 1844 | 201 | |
| Age (years), mean (SD) | 19.3 (19.8) | 19.5 (21.4) | 8.8 (6.0) | |
| Age group (years), n (%) | ≥10 | 520 (47.2) | 644 (48.2) | 95 (47.7) |
| <10 | 581 (52.8) | 693 (51.8) | 104 (52.3) | |
| Sex, n (%) | Female | 702 (50.4) | 853 (49.2) | 87 (43.3) |
| Male | 691 (49.6) | 879 (50.8) | 114 (56.7) | |
| Race or ethnic group, n (%) | White | 1052 (80.4) | 1302 (80.4) | 143 (71.9) |
| Black or African American | 131 (10.0) | 155 (9.6) | 32 (16.1) | |
| Asian | 65 (5.0) | 87 (5.4) | 1 (0.5) | |
| American Indian or Alaska Native | 7 (0.5) | 8 (0.5) | ||
| Native Hawaiian or other Pacific Islander | 7 (0.5) | 10 (0.6) | 2 (1.0) | |
| Other | 46 (3.5) | 57 (3.5) | 21 (10.6) | |
| Hispanic or Latino ethnic group, n (%) | Hispanic or Latino | 204 (19.3) | 245 (19.0) | 25 (12.6) |
| Not Hispanic or Latino | 851 (80.7) | 1044 (81.0) | 174 (87.4) | |
| MRD 1 Status, n (%) | Positive | 282 (29.7) | 361 (31.4) | 76 (40.2) |
| Negative | 667 (70.3) | 787 (68.6) | 113 (59.8) | |
| Leucocyte counts (10⁹/L), n (%) | ≥30 | 572 (52.4) | 646 (48.9) | 88 (44.0) |
| <30 | 520 (47.6) | 676 (51.1) | 112 (56.0) | |
| BM leukemic blasts (%), mean (SD) | 65.8 (24.1) | 65.1 (24.2) | 60.0 (25.6) | |
| Risk Group, n (%) | High Risk | 195 (14.1) | 299 (17.5) | 51 (25.4) |
| Standard Risk | 620 (44.9) | 849 (49.7) | 87 (43.3) | |
| Low Risk | 566 (41.0) | 561 (32.8) | 63 (31.3) | |
| FLT3 ITD, n (%) | Yes | 179 (16.3) | 248 (18.6) | 31 (15.6) |
| No | 920 (83.7) | 1087 (81.4) | 168 (84.4) | |
| Clinical Trial, n (%) | AAML03P1 | 62 (2.6) | 72 (4.0) | |
| AAML0531 | 510 (21.2) | 628 (34.8) | ||
| AAML1031 | 489 (20.3) | 581 (32.2) | ||
| Beat AML Consortium | 192 (8.0) | 225 (12.5) | ||
| CCG2961 | 31 (1.3) | 41 (2.3) | ||
| CETLAM SMD-09 (MDS-tAML) | 166 (6.9) | |||
| French GRAALL 2003–2005 | 141 (5.9) | |||
| Japanese AML05 | 9 (0.4) | 15 (0.8) | ||
| NOPHO ALL92-2000 | 636 (26.5) | |||
| TARGET ALL | 50 (2.1) | 47 (2.6) | ||
| TCGA AML | 118 (4.9) | 194 (10.8) | ||
| AML02 | 159 (79.1) | |||
| AML08 | 42 (20.9) |
By prognostic group#
Discovery#
Show code cell source
def pt_characteristics_by_model(df, model_name, traintest = 'discovery'):
    """
    Build a patient-characteristics table grouped by a model's predicted class.

    Parameters:
    - df: pandas DataFrame holding the clinical columns and the `model_name` column.
    - model_name: Name of the column to group by; also used in the output
      file name and in the first table header.
    - traintest: Label for the cohort (e.g. 'discovery', 'validation'); used
      in the output file name and in the first table header.

    Side effects:
    - Writes the table to
      '../data/pt_characteristics_<model_name>_<traintest>_<today>.xlsx'.

    Returns:
    - The HTML rendering of the table produced by `TableOne.tabulate`.

    Note: the column headers are fixed to 'High' and 'Low', so this assumes
    `model_name` has exactly those two groups, in that display order.
    """
    columns = ['Age (years)', 'Age group (years)', 'Sex', 'Race or ethnic group',
               'Hispanic or Latino ethnic group', 'MRD 1 Status',
               'Leucocyte counts (10⁹/L)', 'BM leukemic blasts (%)',
               'Risk Group', 'Clinical Trial', 'FLT3 ITD', 'Treatment Arm']

    # `order` sets the display order of category levels. 'FLT3 ITD' used to
    # be listed twice in this dict; only the later value (['Yes']) ever took
    # effect, so that single entry is kept.
    mytable_cog = TableOne(
        df, columns,
        overall=False, missing=True,
        pval=True, pval_adjust=False,
        htest_name=True, dip_test=True,
        tukey_test=True, normal_test=True,
        order={
            'FLT3 ITD': ['Yes'],
            'Race or ethnic group': ['White', 'Black or African American', 'Asian'],
            'MRD 1 Status': ['Positive'],
            'Risk Group': ['High Risk', 'Standard Risk'],
            'Leucocyte counts (10⁹/L)': ['≥30'],
            'Age group (years)': ['≥10'],
        },
        groupby=model_name,
    )

    # Save a dated copy of the table next to the input data.
    mytable_cog.to_excel(
        f'../data/pt_characteristics_{model_name}_{traintest}_{date.today()}.xlsx')

    return mytable_cog.tabulate(
        tablefmt="html",
        headers=[f'{model_name} {traintest}', "", 'Missing', 'High', 'Low',
                 'p-value', 'Statistical Test'])
# Discovery table: training samples with a recorded 'Vital Status' (df_px), grouped by the `model_name` column.
pt_characteristics_by_model(df_px, model_name, 'discovery')
Show code cell output
| AML Epigenomic Risk discovery | Missing | High | Low | p-value | Statistical Test | |
|---|---|---|---|---|---|---|
| n | 810 | 1034 | ||||
| Age (years), mean (SD) | 65 | 23.0 (24.7) | 16.6 (17.9) | <0.001 | Two Sample T-test | |
| Age group (years), n (%) | ≥10 | 507 | 278 (48.9) | 366 (47.7) | 0.704 | Chi-squared |
| <10 | 291 (51.1) | 402 (52.3) | ||||
| Sex, n (%) | Female | 112 | 363 (46.5) | 490 (51.5) | 0.046 | Chi-squared |
| Male | 417 (53.5) | 462 (48.5) | ||||
| Race or ethnic group, n (%) | White | 225 | 600 (81.1) | 702 (79.9) | 0.077 | Chi-squared (warning: expected count < 5) |
| Black or African American | 71 (9.6) | 84 (9.6) | ||||
| Asian | 46 (6.2) | 41 (4.7) | ||||
| American Indian or Alaska Native | 1 (0.1) | 7 (0.8) | ||||
| Native Hawaiian or other Pacific Islander | 3 (0.4) | 7 (0.8) | ||||
| Other | 19 (2.6) | 38 (4.3) | ||||
| Hispanic or Latino ethnic group, n (%) | Hispanic or Latino | 555 | 93 (17.0) | 152 (20.5) | 0.140 | Chi-squared |
| Not Hispanic or Latino | 453 (83.0) | 591 (79.5) | ||||
| MRD 1 Status, n (%) | Positive | 696 | 207 (43.3) | 154 (23.0) | <0.001 | Chi-squared |
| Negative | 271 (56.7) | 516 (77.0) | ||||
| Leucocyte counts (10⁹/L), n (%) | ≥30 | 522 | 262 (46.9) | 384 (50.3) | 0.235 | Chi-squared |
| <30 | 297 (53.1) | 379 (49.7) | ||||
| BM leukemic blasts (%), mean (SD) | 236 | 67.2 (24.6) | 63.4 (23.8) | 0.002 | Two Sample T-test | |
| Risk Group, n (%) | High Risk | 135 | 212 (27.6) | 87 (9.2) | <0.001 | Chi-squared |
| Standard Risk | 487 (63.5) | 362 (38.4) | ||||
| Low Risk | 68 (8.9) | 493 (52.3) | ||||
| Clinical Trial, n (%) | AAML03P1 | 41 | 41 (5.1) | 31 (3.1) | <0.001 | Chi-squared |
| AAML0531 | 249 (30.7) | 379 (38.2) | ||||
| AAML1031 | 242 (29.9) | 339 (34.1) | ||||
| Beat AML Consortium | 120 (14.8) | 105 (10.6) | ||||
| CCG2961 | 27 (3.3) | 14 (1.4) | ||||
| Japanese AML05 | 10 (1.2) | 5 (0.5) | ||||
| TARGET ALL | 14 (1.7) | 33 (3.3) | ||||
| TCGA AML | 107 (13.2) | 87 (8.8) | ||||
| FLT3 ITD, n (%) | Yes | 509 | 127 (22.4) | 121 (15.8) | 0.003 | Chi-squared |
| No | 441 (77.6) | 646 (84.2) | ||||
| Treatment Arm, n (%) | Arm A | 1146 | 121 (41.7) | 189 (46.3) | 0.259 | Chi-squared |
| Arm B | 169 (58.3) | 219 (53.7) |
Validation#
Show code cell source
# Validation table: the held-out test samples (df_test), grouped by the `model_name` column.
pt_characteristics_by_model(df_test, model_name, 'validation')
Show code cell output
| AML Epigenomic Risk validation | Missing | High | Low | p-value | Statistical Test | |
|---|---|---|---|---|---|---|
| n | 75 | 126 | ||||
| Age (years), mean (SD) | 2 | 9.3 (6.0) | 8.5 (6.0) | 0.344 | Two Sample T-test | |
| Age group (years), n (%) | ≥10 | 2 | 38 (51.4) | 57 (45.6) | 0.523 | Chi-squared |
| <10 | 36 (48.6) | 68 (54.4) | ||||
| Sex, n (%) | Female | 0 | 32 (42.7) | 55 (43.7) | 1.000 | Chi-squared |
| Male | 43 (57.3) | 71 (56.3) | ||||
| Race or ethnic group, n (%) | White | 2 | 53 (72.6) | 90 (71.4) | 0.724 | Chi-squared (warning: expected count < 5) |
| Black or African American | 11 (15.1) | 21 (16.7) | ||||
| Asian | 1 (1.4) | |||||
| Native Hawaiian or other Pacific Islander | 1 (1.4) | 1 (0.8) | ||||
| Other | 7 (9.6) | 14 (11.1) | ||||
| Hispanic or Latino ethnic group, n (%) | Hispanic or Latino | 2 | 11 (14.9) | 14 (11.2) | 0.594 | Chi-squared |
| Not Hispanic or Latino | 63 (85.1) | 111 (88.8) | ||||
| MRD 1 Status, n (%) | Positive | 12 | 37 (51.4) | 39 (33.3) | 0.021 | Chi-squared |
| Negative | 35 (48.6) | 78 (66.7) | ||||
| Leucocyte counts (10⁹/L), n (%) | ≥30 | 1 | 28 (37.8) | 60 (47.6) | 0.231 | Chi-squared |
| <30 | 46 (62.2) | 66 (52.4) | ||||
| BM leukemic blasts (%), mean (SD) | 21 | 65.1 (27.5) | 57.1 (24.1) | 0.051 | Two Sample T-test | |
| Risk Group, n (%) | High Risk | 0 | 28 (37.3) | 23 (18.3) | <0.001 | Chi-squared |
| Standard Risk | 38 (50.7) | 49 (38.9) | ||||
| Low Risk | 9 (12.0) | 54 (42.9) | ||||
| Clinical Trial, n (%) | AML02 | 0 | 61 (81.3) | 98 (77.8) | 0.674 | Chi-squared |
| AML08 | 14 (18.7) | 28 (22.2) | ||||
| FLT3 ITD, n (%) | Yes | 2 | 14 (18.9) | 17 (13.6) | 0.425 | Chi-squared |
| No | 60 (81.1) | 108 (86.4) | ||||
| Treatment Arm, n (%) | Arm A | 2 | 41 (56.2) | 66 (52.4) | 0.713 | Chi-squared |
| Arm B | 32 (43.8) | 60 (47.6) |
Kaplan-Meier Plots#
Overall study population#
Show code cell source
# One Kaplan-Meier figure per cohort, split by the `model_name` column;
# nothing is written to disk.
for dataset, trial in ((df_cog, 'COG AML trials'),
                       (df_test, 'Validation cohort')):
    draw_kaplan_meier(
        df=dataset,
        model_name=model_name,
        trialname=trial,
        figsize=(8, 8),
        show_ci=False,
        add_risk_counts=False,
        save_plot=False,
        save_survival_table=False,
    )
Show code cell output
Per risk group#
Show code cell source
# Kaplan-Meier figures within each clinical risk group, for each cohort.
risk_groups = ['High Risk', 'Low Risk', 'Standard Risk']
for dataset, trial in ((df_cog, 'COG AML trials'),
                       (df_test, 'Validation cohort')):
    for risk_group in risk_groups:
        # Restrict the cohort to the samples in this clinical risk group.
        in_group = dataset['Risk Group'] == risk_group
        draw_kaplan_meier(
            df=dataset[in_group],
            model_name=model_name,
            trialname=f'{trial} {risk_group}',
            figsize=(8, 8),
            add_risk_counts=False,
            save_plot=False,
            save_survival_table=False,
        )
Show code cell output
Per risk group (AAML1831 COG)#
Show code cell source
# Same per-risk-group Kaplan-Meier figures, using the 'Risk Group AAML1831'
# column; only the COG cohort is plotted here.
dataset, trial = df_cog, 'COG AML trials'
risk_groups = ['High', 'Low', 'Standard']
for risk_group in risk_groups:
    in_group = dataset['Risk Group AAML1831'] == risk_group
    draw_kaplan_meier(
        df=dataset[in_group],
        model_name=model_name,
        trialname=f'{trial} {risk_group} Risk',
        figsize=(8, 8),
        add_risk_counts=False,
        save_plot=False,
        save_survival_table=False,
    )
Show code cell output
Forest Plots#
With MRD 1#
Show code cell source
# Forest plots for overall survival and event-free survival, per cohort.
for dataset, trial in ((df_cog, 'COG AML trials'),
                       (df_test, 'Validation cohort')):
    df_ = dataset.copy()
    # Alias the model column under a name without spaces
    # (presumably required by the plotting helper - confirm).
    df_['AML_Epigenomic_Risk'] = df_['AML Epigenomic Risk']
    for time_col, event_col in (('os.time', 'os.evnt'),
                                ('efs.time', 'efs.evnt')):
        draw_forest_plot(
            time=time_col,
            event=event_col,
            df=df_,
            trialname=trial,
            model_name='AML_Epigenomic_Risk',
            save_plot=False,
        )
Show code cell output
With MRD 1 and BM blast (%)#
Show code cell source
# Forest plots (OS and EFS) with bone-marrow blast percentage as a binary covariate.
for dataset, trial in zip([df_cog, df_test], ['COG AML trials', 'Validation cohort']):
    df_ = dataset.copy()
    # Dichotomise blast percentage at 50 %. pd.cut's bins are open on the
    # left by default, so a value of exactly 0 would fall outside (0, 50]
    # and become NaN; include_lowest=True closes the first bin so that
    # 0 % is labelled '≤50' as the label implies.
    df_['BM leukemic blasts (%)'] = pd.cut(
        df_['BM leukemic blasts (%)'],
        bins=[0, 50, 100],
        labels=['≤50', '>50'],
        include_lowest=True,
    )
    # Alias the model column under a name without spaces
    # (presumably required by the plotting helper - confirm).
    df_['AML_Epigenomic_Risk'] = df_['AML Epigenomic Risk']
    for time_col, event_col in (('os.time', 'os.evnt'),
                                ('efs.time', 'efs.evnt')):
        draw_forest_plot_withBMblast(
            time=time_col,
            event=event_col,
            df=df_,
            trialname=trial,
            model_name='AML_Epigenomic_Risk',
            save_plot=False,
        )
Show code cell output
Without MRD 1#
Show code cell source
# Forest plots (OS and EFS) drawn with the "no MRD" helper.
for dataset, trial in zip([df_cog, df_test], ['COG AML trials', 'Validation cohort']):
    df_ = dataset.copy()
    # Dichotomise blast percentage at 50 %. pd.cut's bins are open on the
    # left by default, so a value of exactly 0 would fall outside (0, 50]
    # and become NaN; include_lowest=True closes the first bin so that
    # 0 % is labelled '≤50' as the label implies.
    df_['BM leukemic blasts (%)'] = pd.cut(
        df_['BM leukemic blasts (%)'],
        bins=[0, 50, 100],
        labels=['≤50', '>50'],
        include_lowest=True,
    )
    # Alias the model column under a name without spaces
    # (presumably required by the plotting helper - confirm).
    df_['AML_Epigenomic_Risk'] = df_['AML Epigenomic Risk']
    for time_col, event_col in (('os.time', 'os.evnt'),
                                ('efs.time', 'efs.evnt')):
        draw_forest_plot_noMRD(
            time=time_col,
            event=event_col,
            df=df_,
            trialname=trial,
            model_name='AML_Epigenomic_Risk',
            save_plot=False,
        )
Show code cell output
ROC AUC performance#
Show code cell source
def plot_roc_auc(df, target, model_name, risk_group='Risk Group', title=None, sum_models=False):
    """
    Draw ROC curves for a model score and a clinical risk group on one Bokeh figure.

    Risk labels ('Low'/'Low Risk', 'Standard'/'Standard Risk', 'High'/'High Risk')
    are converted to 0, 0.5 and 1. The `risk_group` column is always converted;
    the `model_name` column only when it has object dtype. Rows with a missing
    value in any plotted column or in `target` are dropped before the curves
    are computed.

    Parameters:
    - df: pandas DataFrame containing the model predictions, the risk group and the target.
    - target: Name of the column containing the actual target variable.
    - model_name: Name of the column containing the model predictions.
    - risk_group: Name of the column containing the risk group.
    - title: Title of the plot; ', n=<rows kept>' is appended. No title is shown when omitted.
    - sum_models: If True, plot a single curve for the mean of the model and
      risk-group scores instead of one curve for each.

    Returns:
    - The figure with the ROC curves and a dashed chance diagonal.
    """
    label_to_score = {'Low': 0, 'Low Risk': 0,
                      'Standard': 0.5, 'Standard Risk': 0.5,
                      'High': 1, 'High Risk': 1}

    scores = df.copy()
    if df[model_name].dtype == 'O':
        scores[model_name] = scores[model_name].map(label_to_score)
    scores[risk_group] = scores[risk_group].map(label_to_score)

    if sum_models:
        combined = model_name + ' + ' + risk_group
        scores[combined] = (scores[model_name] + scores[risk_group]) / 2
        kept_columns = [combined, target]
    else:
        kept_columns = [model_name, risk_group, target]
    # drop rows with missing values
    scores = scores[kept_columns].dropna()

    heading = title + ', n=' + str(len(scores)) if title else ''

    p = figure(title=heading,
               x_axis_label='False Positive Rate',
               y_axis_label='True Positive Rate',
               width=325, height=325,
               tools='save,reset,pan')
    # Chance-level diagonal for reference.
    p.line([0, 1], [0, 1], line_dash="dashed", color="gray", line_width=1)

    palette = ['navy', 'firebrick', 'olive']
    predictors = scores.columns.difference([target])
    for column, color in zip(predictors, palette):
        fpr, tpr, _ = roc_curve(scores[target], scores[column])
        roc_auc = auc(fpr, tpr)
        p.line(fpr, tpr, legend_label=f"{column}\nAUC = {roc_auc:.2f}",
               color=color, line_width=2, alpha=0.8)

    # Legend, axis and title styling.
    legend = p.legend
    legend.location = "bottom_right"
    legend.click_policy = "hide"
    p.toolbar.logo = None
    legend.label_text_font_size = '8pt'
    legend.spacing = 2
    p.xaxis.axis_label_text_font_style = "normal"
    p.yaxis.axis_label_text_font_style = "normal"
    legend.background_fill_alpha = 0.8
    p.title.text_font_size = '9pt'
    return p
AML epigenomic risk (probability) + risk group#
Show code cell source
# Probability model
# Probability model
model_name = 'P(High Risk)'

# Top row: model score and risk group as separate curves, one panel per cohort.
p1, p2, p3 = (
    plot_roc_auc(cohort, 'os.evnt', model_name, title=label)
    for cohort, label in ((df_px, 'Discovery cohort'),
                          (df_cog, 'Discovery COG peds AML Dx'),
                          (df_test, 'Validation cohort'))
)
# Bottom row: the averaged model + risk-group score for the same cohorts.
p4, p5, p6 = (
    plot_roc_auc(cohort, 'os.evnt', model_name, sum_models=True)
    for cohort in (df_px, df_cog, df_test)
)

# Create a gridplot
p = gridplot([[p1, p2, p3], [p4, p5, p6]], toolbar_location='above')
show(p)
Show code cell output
Note
The sample size shown in each ROC AUC plot may be smaller than the cohort size because samples missing the risk group, the model prediction, or the outcome were removed.
AML epigenomic risk (high-low) + risk group#
Show code cell source
# Binary model
# Binary model
model_name = 'AML Epigenomic Risk'

# Top row: model call and risk group as separate curves, one panel per cohort.
p1, p2, p3 = (
    plot_roc_auc(cohort, 'os.evnt', model_name, title=label)
    for cohort, label in ((df_px, 'Discovery cohort'),
                          (df_cog, 'Discovery COG peds AML Dx'),
                          (df_test, 'Validation cohort'))
)
# Bottom row: the averaged model + risk-group score for the same cohorts.
p4, p5, p6 = (
    plot_roc_auc(cohort, 'os.evnt', model_name, sum_models=True)
    for cohort in (df_px, df_cog, df_test)
)

# Create a gridplot
p = gridplot([[p1, p2, p3], [p4, p5, p6]], toolbar_location='above')
show(p)
Show code cell output
AML epigenomic risk + latest risk group (AAML1831 COG)#
Show code cell source
# Probability model
# Probability model (top row): original risk group, AAML1831 risk group,
# and the AAML1831 risk group averaged with the model score.
model_name = 'P(High Risk)'
p1, p2, p3 = (
    plot_roc_auc(df_cog, 'os.evnt', model_name, **options)
    for options in (
        dict(risk_group='Risk Group', title='Risk group AAML1031-0531'),
        dict(risk_group='Risk Group AAML1831', title='Risk group AAML1831'),
        dict(risk_group='Risk Group AAML1831', sum_models=True,
             title='Risk group AAML1831 + Epigenomic Risk'),
    )
)

# Binary model (bottom row): same three panels, untitled.
model_name = 'AML Epigenomic Risk'
p4, p5, p6 = (
    plot_roc_auc(df_cog, 'os.evnt', model_name, **options)
    for options in (
        dict(risk_group='Risk Group'),
        dict(risk_group='Risk Group AAML1831'),
        dict(risk_group='Risk Group AAML1831', sum_models=True),
    )
)

# Create a gridplot
p = gridplot([[p1, p2, p3], [p4, p5, p6]], toolbar_location='above')
show(p)
Show code cell output
AL epigenomic phenotype#
Show code cell source
## TODO
# from sklearn.metrics import roc_curve, auc
# from sklearn.preprocessing import label_binarize
# from itertools import cycle
# from bokeh.plotting import figure, show
# from bokeh.io import output_notebook
# import pandas as pd
# import numpy as np
# def plot_multiclass_roc_auc(df, target, model_names, title=None):
# """
# Plots ROC AUC for a multiclass classifier using Bokeh, handling non-integer class labels.
# Parameters:
# - df: pandas DataFrame containing model predictions as columns and actual target variable.
# - target: Name of the column containing the actual target variable.
# - model_names: List of column names containing the model predictions for each class.
# - title: Title of the plot.
# """
# # Convert target to binary (one-hot encoding)
# classes = df[target].unique()
# y = label_binarize(df[target], classes=classes)
# n_classes = y.shape[1]
# # Setup plot
# p = figure(title=title,
# x_axis_label='False Positive Rate',
# y_axis_label='True Positive Rate',
# width=1500, height=500,
# tools='save,reset,pan,zoom_in,zoom_out')
# p.line([0, 1], [0, 1], line_dash="dashed", color="gray", line_width=1)
# # Colors for each line
# colors = get_custom_color_palette()
# for i, color in zip(range(n_classes), colors):
# # Prepare true and predicted values
# true = y[:, i]
# # Assuming each model_name is now a single column with prediction probabilities for class i
# for model_name in model_names:
# predicted = df[model_name] # Corrected to directly access model predictions
# fpr, tpr, _ = roc_curve(true, predicted)
# roc_auc = auc(fpr, tpr)
# p.line(fpr, tpr, legend_label=f"Class {i} ({model_name}) AUC = {roc_auc:.2f}",
# color=color, line_width=2)
# p.legend.location = "bottom_right"
# p.legend.click_policy="hide"
# p.toolbar.logo = None
# p.legend.label_text_font_size = '8pt'
# return p
# multiclass_matrix = df_dx[['WHO 2022 Diagnosis', 'AL Epigenomic Phenotype_int']]
# plot = plot_multiclass_roc_auc(df = multiclass_matrix, target = 'WHO 2022 Diagnosis',
# model_names = ['AL Epigenomic Phenotype_int'])
# show(plot)
Box Plots#
Show code cell source
# Distribution of P(High Risk) in the validation cohort across three
# clinical groupings, coloured by the current `model_name` column.
boxplot_kwargs = dict(df=df_test, y='P(High Risk)', hue=model_name,
                      trialname='StJude trials', save_plot=False, figsize=(4, 3))

draw_boxplot(x='Risk Group',
             order=['High Risk', 'Standard Risk', 'Low Risk'],
             **boxplot_kwargs)
draw_boxplot(x='MRD 1 Status',
             order=['Positive', 'Negative'],
             **boxplot_kwargs)
draw_boxplot(x='Primary Cytogenetic Code',
             order='auto',
             **boxplot_kwargs)
Show code cell output
Stacked Bar Plots#
Show code cell source
model_name = 'AML Epigenomic Risk'
# Share of each model call within three clinical groupings of the
# validation cohort.
stacked_bar_kwargs = dict(df=df_test, y=model_name, hue=model_name,
                          trialname='StJude trials', save_plot=False, figsize=(4, 3))

draw_stacked_barplot(x='MRD 1 Status',
                     order=['Positive', 'Negative'],
                     **stacked_bar_kwargs)
draw_stacked_barplot(x='Risk Group',
                     order=['High Risk', 'Standard Risk', 'Low Risk'],
                     fontsize=9, **stacked_bar_kwargs)
draw_stacked_barplot(x='Primary Cytogenetic Code',
                     order='auto',
                     fontsize=6, **stacked_bar_kwargs)
Show code cell output
Sankey plots#
Note
Sankey plots below compare the distribution of categories. The width of the lines is proportional to the number of patients in each group.
Samples with annotated diagnosis info#
Show code cell source
colors = get_custom_color_palette()

# WHO 2022 diagnosis versus predicted phenotype, with nan_action='drop'.
dx_vs_phenotype = ('WHO 2022 Diagnosis', 'AL Epigenomic Phenotype', colors)

draw_sankey_plot(df_train, *dx_vs_phenotype,
                 title='Discovery cohort',
                 fig_size=(4, 10), fontsize=9, nan_action='drop')
draw_sankey_plot(df_cog, *dx_vs_phenotype,
                 title= 'Discovery cohort (COG peds AML Dx samples only)',
                 fig_size=(4, 8), fontsize=9, nan_action='drop')
draw_sankey_plot(df_test, *dx_vs_phenotype,
                 title= 'Validation cohort',
                 fig_size=(3, 6), fontsize=9, nan_action='drop')
Show code cell output
Predictions for samples with no WHO 2022 diagnosis available#
Show code cell source
# Same column pairing as the annotated-diagnosis plots, but with
# nan_action='keep only'.
dx_vs_phenotype = ('WHO 2022 Diagnosis', 'AL Epigenomic Phenotype', colors)

draw_sankey_plot(df_train, *dx_vs_phenotype,
                 title='Discovery cohort',
                 fig_size=(4, 10), fontsize=9, nan_action='keep only')
draw_sankey_plot(df_cog, *dx_vs_phenotype,
                 title= 'Discovery cohort (COG peds AML Dx samples only)',
                 fig_size=(4, 8), fontsize=9, nan_action='keep only')
draw_sankey_plot(df_test, *dx_vs_phenotype,
                 title= 'Validation cohort',
                 fig_size=(4, 8), fontsize=9, nan_action='keep only')
Show code cell output
Reason for unclassified samples#
Show code cell source
# WHO 2022 diagnosis against cytogenetic / fusion annotations, with
# nan_action='keep only'. The COG panel uses 'Gene Fusion'; the other two
# use 'Primary Cytogenetic Code'.
unclassified_kwargs = dict(fontsize=9, nan_action='keep only')

draw_sankey_plot(df_train, 'WHO 2022 Diagnosis', 'Primary Cytogenetic Code', colors,
                 title='Discovery cohort',
                 fig_size=(4, 10), **unclassified_kwargs)
draw_sankey_plot(df_cog, 'WHO 2022 Diagnosis', 'Gene Fusion', colors,
                 title= 'Discovery cohort (COG peds AML Dx samples only)',
                 fig_size=(4, 10), **unclassified_kwargs)
draw_sankey_plot(df_test, 'WHO 2022 Diagnosis', 'Primary Cytogenetic Code', colors,
                 title= 'Validation cohort',
                 fig_size=(4, 8), **unclassified_kwargs)
Show code cell output
Risk group comparison in COG#
Show code cell source
# COG cohort only: original risk group vs. AAML1831 risk group, then
# AAML1831 risk group vs. the epigenomic risk call.
cog_sankey_kwargs = dict(title= 'Discovery cohort (COG peds AML Dx samples only)',
                         fig_size=(2, 4), fontsize=9, nan_action='drop')

draw_sankey_plot(df_cog, 'Risk Group', 'Risk Group AAML1831', colors,
                 **cog_sankey_kwargs)
draw_sankey_plot(df_cog, 'Risk Group AAML1831', 'AML Epigenomic Risk', colors,
                 **cog_sankey_kwargs)
Show code cell output
Px and Dx model comparison#
Show code cell source
# Prognostic (Px) call versus diagnostic (Dx) phenotype for each cohort,
# with nan_action='drop'.
px_vs_dx = ('AML Epigenomic Risk', 'AL Epigenomic Phenotype', colors)

draw_sankey_plot(df_train, *px_vs_dx,
                 title='Discovery cohort',
                 fig_size=(3, 10), fontsize=9, nan_action='drop')
draw_sankey_plot(df_cog, *px_vs_dx,
                 title= 'Discovery cohort (COG peds AML Dx samples only)',
                 fig_size=(3, 10), fontsize=9, nan_action='drop')
draw_sankey_plot(df_test, *px_vs_dx,
                 title= 'Validation cohort',
                 fig_size=(3, 8), fontsize=9, nan_action='drop')
Show code cell output